{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Step 4: Feature Engineering\n",
    "\n",
    "Use the code below to run TensorFlow Transform on some example data using the schema from your pipeline. Start by importing and opening the metadata store."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/tmy/anaconda3/envs/tfx-env/lib/python3.6/site-packages/apache_beam/__init__.py:84: UserWarning: Running the Apache Beam SDK on Python 3 is not yet fully supported. You may encounter buggy behavior or missing features.\n",
      "  'Running the Apache Beam SDK on Python 3 is not yet fully supported. '\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Pipeline DB:\n",
      "/home/tmy/airflow/tfx/metadata/taxi/metadata.db\n"
     ]
    }
   ],
   "source": [
    "from __future__ import print_function\n",
    "\n",
    "import os\n",
    "import tempfile\n",
    "import pandas as pd\n",
    "\n",
    "import tensorflow as tf\n",
    "import tensorflow_transform as tft\n",
    "from tensorflow_transform import beam as tft_beam\n",
    "import tfx_utils\n",
    "from tfx.utils import io_utils\n",
    "from tensorflow_metadata.proto.v0 import schema_pb2\n",
    "\n",
    "# For DatasetMetadata boilerplate\n",
    "from tensorflow_transform.tf_metadata import dataset_metadata\n",
    "from tensorflow_transform.tf_metadata import dataset_schema\n",
    "from tensorflow_transform.tf_metadata import schema_utils\n",
    "\n",
    "def _make_default_sqlite_uri(pipeline_name):\n",
    "    return os.path.join(os.environ['HOME'], 'airflow/tfx/metadata', pipeline_name, 'metadata.db')\n",
    "\n",
    "def get_metadata_store(pipeline_name):\n",
    "    return tfx_utils.TFXReadonlyMetadataStore.from_sqlite_db(_make_default_sqlite_uri(pipeline_name))\n",
    "\n",
    "pipeline_name = 'taxi'\n",
    "\n",
    "pipeline_db_path = _make_default_sqlite_uri(pipeline_name)\n",
    "print('Pipeline DB:\\n{}'.format(pipeline_db_path))\n",
    "\n",
    "store = get_metadata_store(pipeline_name)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Get the schema URI from the metadata store"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Schema URI:\n",
      "/home/tmy/airflow/tfx/pipelines/taxi/SchemaGen/output/3/schema.pbtxt\n"
     ]
    }
   ],
   "source": [
    "# Get the schema URI from the metadata store\n",
    "schemas = store.get_artifacts_of_type_df(tfx_utils.TFXArtifactTypes.SCHEMA)\n",
    "assert len(schemas.URI) == 1\n",
    "schema_uri = schemas.URI.iloc[0] + 'schema.pbtxt'\n",
    "print ('Schema URI:\\n{}'.format(schema_uri))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Get the schema that was inferred by TensorFlow Data Validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "schema_proto = io_utils.parse_pbtxt_file(file_name=schema_uri, message=schema_pb2.Schema())\n",
    "feature_spec, domains = schema_utils.schema_as_feature_spec(schema_proto)\n",
    "legacy_metadata = dataset_metadata.DatasetMetadata(dataset_schema.from_feature_spec(feature_spec, domains))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Define features and create functions for TensorFlow Transform"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Categorical features are assumed to each have a maximum value in the dataset.\n",
    "_MAX_CATEGORICAL_FEATURE_VALUES = [24, 31, 12]\n",
    "\n",
    "_CATEGORICAL_FEATURE_KEYS = [\n",
    "    'trip_start_hour', 'trip_start_day', 'trip_start_month',\n",
    "    'pickup_census_tract', 'dropoff_census_tract', 'pickup_community_area',\n",
    "    'dropoff_community_area'\n",
    "]\n",
    "\n",
    "_DENSE_FLOAT_FEATURE_KEYS = ['trip_miles', 'fare', 'trip_seconds']\n",
    "\n",
    "# Number of buckets used by tf.transform for encoding each feature.\n",
    "_FEATURE_BUCKET_COUNT = 10\n",
    "\n",
    "_BUCKET_FEATURE_KEYS = [\n",
    "    'pickup_latitude', 'pickup_longitude', 'dropoff_latitude',\n",
    "    'dropoff_longitude'\n",
    "]\n",
    "\n",
    "# Number of vocabulary terms used for encoding VOCAB_FEATURES by tf.transform\n",
    "_VOCAB_SIZE = 1000\n",
    "\n",
    "# Count of out-of-vocab buckets in which unrecognized VOCAB_FEATURES are hashed.\n",
    "_OOV_SIZE = 10\n",
    "\n",
    "_VOCAB_FEATURE_KEYS = [\n",
    "    'payment_type',\n",
    "    'company',\n",
    "]\n",
    "\n",
    "# Keys\n",
    "_LABEL_KEY = 'tips'\n",
    "_FARE_KEY = 'fare'\n",
    "\n",
    "\n",
    "def _transformed_name(key):\n",
    "  return key + '_xf'\n",
    "\n",
    "def _transformed_names(keys):\n",
    "  return [_transformed_name(key) for key in keys]\n",
    "\n",
    "def _fill_in_missing(x):\n",
    "  \"\"\"Replace missing values in a SparseTensor.\n",
    "\n",
    "  Fills in missing values of `x` with '' or 0, and converts to a dense tensor.\n",
    "\n",
    "  Args:\n",
    "    x: A `SparseTensor` of rank 2.  Its dense shape should have size at most 1\n",
    "      in the second dimension.\n",
    "\n",
    "  Returns:\n",
    "    A rank 1 tensor where missing values of `x` have been filled in.\n",
    "  \"\"\"\n",
    "  default_value = '' if x.dtype == tf.string else 0\n",
    "  return tf.squeeze(\n",
    "      tf.sparse_to_dense(x.indices, [x.dense_shape[0], 1], x.values,\n",
    "                         default_value),\n",
    "      axis=1)\n",
    "\n",
    "def preprocessing_fn(inputs):\n",
    "  \"\"\"tf.transform's callback function for preprocessing inputs.\n",
    "\n",
    "  Args:\n",
    "    inputs: map from feature keys to raw not-yet-transformed features.\n",
    "\n",
    "  Returns:\n",
    "    Map from string feature key to transformed feature operations.\n",
    "  \"\"\"\n",
    "  outputs = {}\n",
    "  for key in _DENSE_FLOAT_FEATURE_KEYS:\n",
    "    # Preserve this feature as a dense float, setting nan's to the mean.\n",
    "    outputs[_transformed_name(key)] = tft.scale_to_z_score(\n",
    "        _fill_in_missing(inputs[key]))\n",
    "\n",
    "  for key in _VOCAB_FEATURE_KEYS:\n",
    "    # Build a vocabulary for this feature.\n",
    "    outputs[_transformed_name(key)] = tft.compute_and_apply_vocabulary(\n",
    "        _fill_in_missing(inputs[key]),\n",
    "        top_k=_VOCAB_SIZE,\n",
    "        num_oov_buckets=_OOV_SIZE)\n",
    "\n",
    "  for key in _BUCKET_FEATURE_KEYS:\n",
    "    outputs[_transformed_name(key)] = tft.bucketize(\n",
    "        _fill_in_missing(inputs[key]), _FEATURE_BUCKET_COUNT)\n",
    "\n",
    "  for key in _CATEGORICAL_FEATURE_KEYS:\n",
    "    outputs[_transformed_name(key)] = _fill_in_missing(inputs[key])\n",
    "\n",
    "  # Was this passenger a big tipper?\n",
    "  taxi_fare = _fill_in_missing(inputs[_FARE_KEY])\n",
    "  tips = _fill_in_missing(inputs[_LABEL_KEY])\n",
    "  outputs[_transformed_name(_LABEL_KEY)] = tf.where(\n",
    "      tf.is_nan(taxi_fare),\n",
    "      tf.cast(tf.zeros_like(taxi_fare), tf.int64),\n",
    "      # Test if the tip was > 20% of the fare.\n",
    "      tf.cast(\n",
    "          tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))), tf.int64))\n",
    "\n",
    "  return outputs\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Display the results of transforming some example data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From <ipython-input-5-d1598abb6e5e>:57: sparse_to_dense (from tensorflow.python.ops.sparse_ops) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING: Logging before flag parsing goes to stderr.\n",
      "W0705 10:59:05.511427 140414483334912 deprecation.py:323] From <ipython-input-5-d1598abb6e5e>:57: sparse_to_dense (from tensorflow.python.ops.sparse_ops) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From /home/tmy/anaconda3/envs/tfx-env/lib/python3.6/site-packages/tensorflow_transform/mappers.py:1027: to_int64 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Use tf.cast instead.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "W0705 10:59:05.795439 140414483334912 deprecation.py:323] From /home/tmy/anaconda3/envs/tfx-env/lib/python3.6/site-packages/tensorflow_transform/mappers.py:1027: to_int64 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Use tf.cast instead.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From /home/tmy/anaconda3/envs/tfx-env/lib/python3.6/site-packages/tensorflow/python/saved_model/signature_def_utils_impl.py:205: build_tensor_info (from tensorflow.python.saved_model.utils_impl) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.utils.build_tensor_info or tf.compat.v1.saved_model.build_tensor_info.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "W0705 10:59:06.010318 140414483334912 deprecation.py:323] From /home/tmy/anaconda3/envs/tfx-env/lib/python3.6/site-packages/tensorflow/python/saved_model/signature_def_utils_impl.py:205: build_tensor_info (from tensorflow.python.saved_model.utils_impl) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.utils.build_tensor_info or tf.compat.v1.saved_model.build_tensor_info.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Assets added to graph.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "I0705 10:59:06.017235 140414483334912 builder_impl.py:654] Assets added to graph.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:No assets to write.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "I0705 10:59:06.018948 140414483334912 builder_impl.py:449] No assets to write.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:SavedModel written to: /tmp/tmpmd6vztk2/tftransform_tmp/d1f6d4a03ad643849814e4c5f3b857ef/saved_model.pb\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "I0705 10:59:06.081606 140414483334912 builder_impl.py:414] SavedModel written to: /tmp/tmpmd6vztk2/tftransform_tmp/d1f6d4a03ad643849814e4c5f3b857ef/saved_model.pb\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Assets added to graph.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "I0705 10:59:09.007880 140414483334912 builder_impl.py:654] Assets added to graph.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:No assets to write.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "I0705 10:59:09.009715 140414483334912 builder_impl.py:449] No assets to write.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:SavedModel written to: /tmp/tmpmd6vztk2/tftransform_tmp/c1089b9810b644e88089e26ef4bfe4fa/saved_model.pb\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "I0705 10:59:09.042866 140414483334912 builder_impl.py:414] SavedModel written to: /tmp/tmpmd6vztk2/tftransform_tmp/c1089b9810b644e88089e26ef4bfe4fa/saved_model.pb\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Saver not created because there are no variables in the graph to restore\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "I0705 10:59:16.053239 140414483334912 saver.py:1483] Saver not created because there are no variables in the graph to restore\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Saver not created because there are no variables in the graph to restore\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "I0705 10:59:20.210193 140414483334912 saver.py:1483] Saver not created because there are no variables in the graph to restore\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Assets added to graph.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "I0705 10:59:20.240610 140414483334912 builder_impl.py:654] Assets added to graph.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Assets written to: /tmp/tmpmd6vztk2/tftransform_tmp/c73ddda8e0324f1c8b0436ea4237ce44/assets\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "I0705 10:59:20.244335 140414483334912 builder_impl.py:763] Assets written to: /tmp/tmpmd6vztk2/tftransform_tmp/c73ddda8e0324f1c8b0436ea4237ce44/assets\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:SavedModel written to: /tmp/tmpmd6vztk2/tftransform_tmp/c73ddda8e0324f1c8b0436ea4237ce44/saved_model.pb\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "I0705 10:59:20.301272 140414483334912 builder_impl.py:414] SavedModel written to: /tmp/tmpmd6vztk2/tftransform_tmp/c73ddda8e0324f1c8b0436ea4237ce44/saved_model.pb\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:Expected binary or unicode string, got type_url: \"type.googleapis.com/tensorflow.AssetFileDef\"\n",
      "value: \"\\n\\014\\n\\nConst_10:0\\022-vocab_compute_and_apply_vocabulary_vocabulary\"\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "W0705 10:59:20.664853 140414483334912 ops.py:6153] Expected binary or unicode string, got type_url: \"type.googleapis.com/tensorflow.AssetFileDef\"\n",
      "value: \"\\n\\014\\n\\nConst_10:0\\022-vocab_compute_and_apply_vocabulary_vocabulary\"\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:Expected binary or unicode string, got type_url: \"type.googleapis.com/tensorflow.AssetFileDef\"\n",
      "value: \"\\n\\014\\n\\nConst_11:0\\022/vocab_compute_and_apply_vocabulary_1_vocabulary\"\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "W0705 10:59:20.666405 140414483334912 ops.py:6153] Expected binary or unicode string, got type_url: \"type.googleapis.com/tensorflow.AssetFileDef\"\n",
      "value: \"\\n\\014\\n\\nConst_11:0\\022/vocab_compute_and_apply_vocabulary_1_vocabulary\"\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Saver not created because there are no variables in the graph to restore\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "I0705 10:59:20.668937 140414483334912 saver.py:1483] Saver not created because there are no variables in the graph to restore\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:Expected binary or unicode string, got type_url: \"type.googleapis.com/tensorflow.AssetFileDef\"\n",
      "value: \"\\n\\014\\n\\nConst_10:0\\022-vocab_compute_and_apply_vocabulary_vocabulary\"\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "W0705 10:59:20.912867 140414483334912 ops.py:6153] Expected binary or unicode string, got type_url: \"type.googleapis.com/tensorflow.AssetFileDef\"\n",
      "value: \"\\n\\014\\n\\nConst_10:0\\022-vocab_compute_and_apply_vocabulary_vocabulary\"\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:Expected binary or unicode string, got type_url: \"type.googleapis.com/tensorflow.AssetFileDef\"\n",
      "value: \"\\n\\014\\n\\nConst_11:0\\022/vocab_compute_and_apply_vocabulary_1_vocabulary\"\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "W0705 10:59:20.914435 140414483334912 ops.py:6153] Expected binary or unicode string, got type_url: \"type.googleapis.com/tensorflow.AssetFileDef\"\n",
      "value: \"\\n\\014\\n\\nConst_11:0\\022/vocab_compute_and_apply_vocabulary_1_vocabulary\"\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Saver not created because there are no variables in the graph to restore\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "I0705 10:59:20.917026 140414483334912 saver.py:1483] Saver not created because there are no variables in the graph to restore\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>company</th>\n",
       "      <th>dropoff_census_tract</th>\n",
       "      <th>dropoff_community_area</th>\n",
       "      <th>dropoff_latitude</th>\n",
       "      <th>dropoff_longitude</th>\n",
       "      <th>fare</th>\n",
       "      <th>payment_type</th>\n",
       "      <th>pickup_census_tract</th>\n",
       "      <th>pickup_community_area</th>\n",
       "      <th>pickup_latitude</th>\n",
       "      <th>pickup_longitude</th>\n",
       "      <th>tips</th>\n",
       "      <th>trip_miles</th>\n",
       "      <th>trip_seconds</th>\n",
       "      <th>trip_start_day</th>\n",
       "      <th>trip_start_hour</th>\n",
       "      <th>trip_start_month</th>\n",
       "      <th>trip_start_timestamp</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>[taxi inc.]</td>\n",
       "      <td>[12345.0]</td>\n",
       "      <td>[123]</td>\n",
       "      <td>[80.01]</td>\n",
       "      <td>[12.05]</td>\n",
       "      <td>[100.0]</td>\n",
       "      <td>[visa]</td>\n",
       "      <td>[abcd]</td>\n",
       "      <td>[123]</td>\n",
       "      <td>[80.0]</td>\n",
       "      <td>[12.0]</td>\n",
       "      <td>[10.0]</td>\n",
       "      <td>[8.0]</td>\n",
       "      <td>[600.0]</td>\n",
       "      <td>[12]</td>\n",
       "      <td>[12]</td>\n",
       "      <td>[5]</td>\n",
       "      <td>[123456]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       company dropoff_census_tract dropoff_community_area dropoff_latitude  \\\n",
       "0  [taxi inc.]            [12345.0]                  [123]          [80.01]   \n",
       "\n",
       "  dropoff_longitude     fare payment_type pickup_census_tract  \\\n",
       "0           [12.05]  [100.0]       [visa]              [abcd]   \n",
       "\n",
       "  pickup_community_area pickup_latitude pickup_longitude    tips trip_miles  \\\n",
       "0                 [123]          [80.0]           [12.0]  [10.0]      [8.0]   \n",
       "\n",
       "  trip_seconds trip_start_day trip_start_hour trip_start_month  \\\n",
       "0      [600.0]           [12]            [12]              [5]   \n",
       "\n",
       "  trip_start_timestamp  \n",
       "0             [123456]  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>company_xf</th>\n",
       "      <th>dropoff_census_tract_xf</th>\n",
       "      <th>dropoff_community_area_xf</th>\n",
       "      <th>dropoff_latitude_xf</th>\n",
       "      <th>dropoff_longitude_xf</th>\n",
       "      <th>fare_xf</th>\n",
       "      <th>payment_type_xf</th>\n",
       "      <th>pickup_census_tract_xf</th>\n",
       "      <th>pickup_community_area_xf</th>\n",
       "      <th>pickup_latitude_xf</th>\n",
       "      <th>pickup_longitude_xf</th>\n",
       "      <th>tips_xf</th>\n",
       "      <th>trip_miles_xf</th>\n",
       "      <th>trip_seconds_xf</th>\n",
       "      <th>trip_start_day_xf</th>\n",
       "      <th>trip_start_hour_xf</th>\n",
       "      <th>trip_start_month_xf</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>12345.0</td>\n",
       "      <td>123.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>b'abcd'</td>\n",
       "      <td>123</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>12</td>\n",
       "      <td>12</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   company_xf  dropoff_census_tract_xf  dropoff_community_area_xf  \\\n",
       "0           0                  12345.0                      123.0   \n",
       "\n",
       "   dropoff_latitude_xf  dropoff_longitude_xf  fare_xf  payment_type_xf  \\\n",
       "0                    1                     1      0.0                0   \n",
       "\n",
       "  pickup_census_tract_xf  pickup_community_area_xf  pickup_latitude_xf  \\\n",
       "0                b'abcd'                       123                   1   \n",
       "\n",
       "   pickup_longitude_xf  tips_xf  trip_miles_xf  trip_seconds_xf  \\\n",
       "0                    1        0            0.0              0.0   \n",
       "\n",
       "   trip_start_day_xf  trip_start_hour_xf  trip_start_month_xf  \n",
       "0                 12                  12                    5  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from IPython.display import display\n",
    "with tft_beam.Context(temp_dir=tempfile.mkdtemp()):\n",
    "    raw_examples = [\n",
    "        {\n",
    "            \"fare\": [100.0],\n",
    "            \"trip_start_hour\": [12],\n",
    "            \"pickup_census_tract\": ['abcd'],\n",
    "            \"dropoff_census_tract\": [12345.0],  # No idea why this is a float\n",
    "            \"company\": ['taxi inc.'],\n",
    "            \"trip_start_timestamp\": [123456],\n",
    "            \"pickup_longitude\": [12.0],\n",
    "            \"trip_start_month\": [5],\n",
    "            \"trip_miles\": [8.0],\n",
    "            \"dropoff_longitude\": [12.05],\n",
    "            \"dropoff_community_area\": [123],\n",
    "            \"pickup_community_area\": [123],\n",
    "            \"payment_type\": ['visa'],\n",
    "            \"trip_seconds\": [600.0],\n",
    "            \"trip_start_day\": [12],\n",
    "            \"tips\": [10.0],\n",
    "            \"pickup_latitude\": [80.0],\n",
    "            \"dropoff_latitude\": [80.01],\n",
    "        }\n",
    "    ]\n",
    "    (transformed_examples, transformed_metadata), transform_fn = (\n",
    "        (raw_examples, legacy_metadata)\n",
    "        | 'AnalyzeAndTransform' >> tft_beam.AnalyzeAndTransformDataset(\n",
    "            preprocessing_fn))\n",
    "    display(pd.DataFrame(raw_examples))\n",
    "    display(pd.DataFrame(transformed_examples))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
